The files are available at https://github.com/v0369012/Pokemon_PCA.
# Load the Pokemon value list as a character vector, one element per file line.
# The file is declared as UTF-8 so that non-ASCII names are marked correctly.
PKM_values_7 <- readLines(con = "Pokemon_list_g7.txt", encoding = "UTF-8")
To simplify the analysis, we removed Pokemon with special forms, such as Mega and Alolan forms.
# Loading packages
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 3.2.1 √ purrr 0.3.3
## √ tibble 2.1.3 √ dplyr 0.8.3
## √ tidyr 1.0.0 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.4.0
## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Pokemon without a special form: the filter keeps only the lines that contain
# exactly eight "|" separators.
PKM_without_spf <- (str_count(PKM_values_7, "\\|") == 8)
PKM_values_7_without_spf <- PKM_values_7[PKM_without_spf]
# Break every retained line into its "|"-delimited fields (a list with one
# character vector per line).
PKM_values_7_without_spf_split <- PKM_values_7_without_spf %>%
  str_split("\\|")
# Make a Pokemon table
# Every retained line has eight "|" separators, i.e. nine fields, so the
# flattened fields are laid out once as a 9-column matrix (one row per Pokemon)
# instead of re-running unlist() and a strided seq() for every column.
# Only the first 792 rows are used, matching the hard-coded generation
# boundaries below; fewer rows now fail loudly instead of yielding NA rows.
PKM_values_7_fields <- matrix(
  unlist(PKM_values_7_without_spf_split),
  ncol = 9,
  byrow = TRUE
)[seq_len(792), , drop = FALSE]

# Row index of the last Pokemon of generations 1..7 within the filtered list.
PKM_generation_last_row <- c(151, 251, 385, 490, 640, 707, 792)

PKM_values_7_without_spf_df <- data.frame(
  # Field 1 is unused; fields 2 and 3 hold the number and the name.
  Number = PKM_values_7_fields[, 2],
  Name = PKM_values_7_fields[, 3],
  # Repeat each generation label once per row that belongs to it.
  generation = rep(
    c(1, 2, 3, 4, 5, 6, 7),
    times = diff(c(0, PKM_generation_last_row))
  ),
  # Fields 4-9 are the six stats, stored as text in the source file.
  HP = as.numeric(PKM_values_7_fields[, 4]),
  ATK = as.numeric(PKM_values_7_fields[, 5]),
  DEF = as.numeric(PKM_values_7_fields[, 6]),
  SATK = as.numeric(PKM_values_7_fields[, 7]),
  SDEF = as.numeric(PKM_values_7_fields[, 8]),
  # The last field carries closing "}" characters that must be stripped
  # before the numeric conversion.
  SPEED = as.numeric(str_replace_all(PKM_values_7_fields[, 9], "\\}", ""))
)
# Preview the first rows of the stats table
PKM_values_7_without_spf_df %>%
  head()
## Number Name generation HP ATK DEF SATK SDEF SPEED
## 1 001 妙蛙种子 1 45 49 49 65 65 45
## 2 002 妙蛙草 1 60 62 63 80 80 60
## 3 003 妙蛙花 1 80 82 83 100 100 80
## 4 004 小火龙 1 39 52 43 60 50 65
## 5 005 火恐龙 1 58 64 58 80 65 80
## 6 006 喷火龙 1 78 84 78 109 85 100
# Address types table
PKM_types_7 <- readLines("Pokemon_types.txt")

# Split every line on "|" exactly once (the previous loops re-split the whole
# file on each iteration). Only the first 876 lines are used, as before.
PKM_types_7_fields <- str_split(PKM_types_7, "\\|")[seq_len(876)]

# Pull the `index`-th "|"-delimited field out of every row, returning a plain
# character vector with one element per row.
extract_types_field <- function(rows, index) {
  vapply(
    rows,
    function(fields) fields[[index]],
    character(1),
    USE.NAMES = FALSE
  )
}

# Field 3: Pokemon number
PKM_types_7_number <- extract_types_field(PKM_types_7_fields, 3)
# Remove Pokemon number containing letters
position_without_letters <- str_detect(PKM_types_7_number, "^[0-9]*$")
PKM_types_7_number_without_letters <- PKM_types_7_number[position_without_letters]
# Field 4: name
PKM_types_7_name <- extract_types_field(PKM_types_7_fields, 4)
# Field 6: first type, with any "}" characters stripped
PKM_types_7_types1 <- extract_types_field(PKM_types_7_fields, 6) %>%
  str_remove_all("\\}")
# Assemble the first-type table: one row per parsed line of the types file,
# holding the Pokemon number, its name and its first type.
PKM_types_7_df <- data.frame(
  Number = PKM_types_7_number,
  Name   = PKM_types_7_name,
  types1 = PKM_types_7_types1
)
# Preview the first rows of the types table
PKM_types_7_df %>%
  head()
## Number Name types1
## 1 001 Bulbasaur Grass
## 2 002 Ivysaur Grass
## 3 003 Venusaur Grass
## 4 004 Charmander Fire
## 5 005 Charmeleon Fire
## 6 006 Charizard Fire
# Remove number with letters: keep only the types rows whose number is made of
# digits only. dplyr::filter is spelled out because stats::filter also exists.
PKM_types_7_df_t <- dplyr::filter(
  PKM_types_7_df,
  Number %in% PKM_types_7_number_without_letters
)
# Merge Pokemon table and types table by number (merge() keeps only numbers
# present in both tables).
PKM_merged_df <- merge(
  PKM_types_7_df_t,
  PKM_values_7_without_spf_df,
  by = "Number"
)
# Both inputs have a "Name" column, so merge() suffixes them: "Name.x" comes
# from the types table and "Name.y" from the stats table (the Chinese names).
# Remove Chinese names by column name rather than by position, so the step
# does not depend on the column order merge() happens to produce.
PKM_merged_df <- PKM_merged_df[, names(PKM_merged_df) != "Name.y", drop = FALSE]
names(PKM_merged_df)[names(PKM_merged_df) == "Name.x"] <- "Name"
# Check the correlation between the variables
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Pairwise correlations of the stat columns (column 5 onward), reshaped to
# long format (Var1, Var2, value); show the first rows only.
PKM_merged_df[, 5:ncol(PKM_merged_df)] %>%
  cor() %>%
  melt() %>%
  head()
## Var1 Var2 value
## 1 HP HP 1.0000000
## 2 ATK HP 0.4355205
## 3 DEF HP 0.2326511
## 4 SATK HP 0.3789334
## 5 SDEF HP 0.3630897
## 6 SPEED HP 0.1579874
# Plot heatmap: one tile per pair of stat columns, filled by their correlation
# on a diverging scale centred at 0.
PKM_merged_df[, 5:ncol(PKM_merged_df)] %>%
  cor() %>%
  melt() %>%
  ggplot(aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient2(
    low = "firebrick4",
    high = "steelblue",
    mid = "white",
    midpoint = 0
  ) +
  guides(fill = guide_legend(title = "Correlation")) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    axis.title = element_blank()
  )
# PCA on the stat columns (column 5 onward of the merged table).
# NOTE(review): the previous call passed a bare positional `T`, which binds to
# prcomp()'s second argument `retx`, not to `scale.`. The variables are
# therefore centred but NOT scaled to unit variance. The argument is now named
# and spelled TRUE so results are unchanged; confirm that unscaled PCA is
# what is intended.
pca.model <- prcomp(PKM_merged_df[, 5:ncol(PKM_merged_df)], retx = TRUE)
# Print the standard deviation and the (cumulative) proportion of variance of
# each principal component
pca.model %>%
  summary()
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 45.3401 30.5620 26.2863 22.890 18.60068 14.5243
## Proportion of Variance 0.4317 0.1962 0.1451 0.110 0.07266 0.0443
## Cumulative Proportion 0.4317 0.6279 0.7730 0.883 0.95570 1.0000
# Loadings: the contribution of each stat column to each principal component
pca.model[["rotation"]]
## PC1 PC2 PC3 PC4 PC5 PC6
## HP 0.3659171 0.01407945 -0.12663530 -0.79582535 0.3333136 -0.3247060
## ATK 0.4641571 -0.09316829 -0.71487140 0.05861969 -0.1662638 0.4834841
## DEF 0.4194666 -0.64774595 0.04716237 0.36935249 -0.0344900 -0.5144296
## SATK 0.4636844 0.39081599 0.33978416 -0.08918610 -0.7063606 -0.0995336
## SDEF 0.4208148 -0.12777920 0.57761282 0.03901981 0.3998128 0.5581917
## SPEED 0.2874409 0.63441466 -0.14699745 0.46618427 0.4486350 -0.2732882
# Make a pca table to plot: the scores on the first two principal components,
# one row per Pokemon, with the Pokemon number as row name.
p1_p2_table <- as.data.frame(pca.model$x[, 1:2])
rownames(p1_p2_table) <- PKM_merged_df[, 1]

# Pokemon numbers that are highlighted as "legend" in the plots.
legend_number <- c(
  144:146, 150:151,
  243:245, 249:251,
  377:386,
  479:494,
  638:649,
  716:721,
  785:809
)

# %in% already yields one TRUE/FALSE per row, so the mask needs no
# pre-filled vector. The match is made on the text form of both sides; every
# number listed above has three digits, so zero padding is not an issue.
legend <- PKM_merged_df[["Number"]] %in% legend_number
# Row positions of the flagged Pokemon (kept for later inspection).
legend_position <- which(legend)

# Attach the descriptive columns to the scores. Each column is extracted as a
# plain vector with [[ ]] rather than passing a one-column data frame.
p1_p2_table_t <- cbind(
  p1_p2_table,
  Number = PKM_merged_df[["Number"]],
  Name = PKM_merged_df[["Name"]],
  generation = PKM_merged_df[["generation"]],
  types1 = PKM_merged_df[["types1"]],
  legend = legend
)
# Store generation as text so the plots treat it as a discrete colour group
# rather than a continuous scale.
p1_p2_table_t[["generation"]] <- as.character(p1_p2_table_t[["generation"]])
# Visualization: PC1 vs PC2 scatter plots that differ only in the colour
# mapping. The aesthetics are listed in the same order in every plot.

# No colouring
pca_gg <- ggplot(
  data = p1_p2_table_t,
  mapping = aes(x = PC1, y = PC2, label = Number)
) +
  geom_point(size = 2.5)

# Coloured by generation
pca_gg_generation <- ggplot(
  data = p1_p2_table_t,
  mapping = aes(x = PC1, y = PC2, label = Number, color = generation)
) +
  geom_point(size = 2.5)

# Coloured by first type
pca_gg_types1 <- ggplot(
  data = p1_p2_table_t,
  mapping = aes(x = PC1, y = PC2, label = Number, color = types1)
) +
  geom_point(size = 2.5)

# Coloured by the legend flag
pca_gg_legend <- ggplot(
  data = p1_p2_table_t,
  mapping = aes(x = PC1, y = PC2, label = Number, color = legend)
) +
  geom_point(size = 2.5)
# User-interactive visualization
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Turn each static ggplot into an interactive plotly widget, then display it.

# No colouring
pca_ggly <- pca_gg %>% ggplotly()
pca_ggly

# Coloured by generation
pca_ggly_generation <- pca_gg_generation %>% ggplotly()
pca_ggly_generation

# Coloured by first type
pca_ggly_types1 <- pca_gg_types1 %>% ggplotly()
pca_ggly_types1

# Coloured by the legend flag
pca_ggly_legend <- pca_gg_legend %>% ggplotly()
pca_ggly_legend